package com.example.book.crawler.pageprocessor;

import com.example.book.crawler.entity.CrawlerContent;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.UUID;

/**
 * WebMagic {@link PageProcessor} that extracts the inner HTML of the element with id
 * {@code articlecontent} from a downloaded page, strips a fixed set of site watermark
 * strings from it, and publishes the result under the {@code "content"} field.
 *
 * <p>Pages on which the {@code articlecontent} element cannot be found are marked as
 * skipped, so no {@code "content"} field is produced for them.
 */
public class ContentPageProcessor implements PageProcessor {

    /**
     * Watermark snippets removed from the extracted HTML, applied in array order.
     * These are matched literally (no regex), exactly as they appear in the page markup.
     */
    private static final String[] WATERMARKS = {
            "手机看好书·尽在·无名小说手机版（m.wmtxt.com）",
            "精彩阅读·尽在·无名小说网（www.wmtxt.com）",
            "精彩阅读·尽在·无名小说网（<a href=\"http://www.wmtxt.com\" target=\"_blank\">www.wmtxt.com</a>）",
            " 手机看书，尽在·无名小说手机版M.wmtxt.coM",
    };

    /** Crawl settings: 5 retries, 5000 ms sleep between requests, 10000 ms timeout, desktop Chrome UA. */
    private final Site site = Site.me().setRetryTimes(5).setSleepTime(5000).setTimeOut(10000)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36");

    /**
     * Extracts the article body from {@code page} and stores it, with watermarks removed,
     * in the page's {@code "content"} result field.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {

        System.out.println("code:" + page.getStatusCode());

        String content = page.getHtml().xpath("//*[@id=\"articlecontent\"]/html()").toString();

        // The selector yields null when no node matches (e.g. an error page or a changed
        // layout). Skip the page instead of failing with a NullPointerException below.
        if (content == null) {
            page.setSkip(true);
            return;
        }

        page.putField("content", stripWatermarks(content));
    }

    /**
     * Removes every occurrence of each entry in {@link #WATERMARKS} from {@code html}.
     *
     * @param html non-null HTML fragment
     * @return {@code html} with all watermark snippets deleted
     */
    private static String stripWatermarks(String html) {
        String cleaned = html;
        for (String watermark : WATERMARKS) {
            cleaned = cleaned.replace(watermark, "");
        }
        return cleaned;
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Manual smoke run: crawls a single hard-coded chapter URL on one thread and prints
     * the extracted content to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        Spider spider = new Spider(new ContentPageProcessor());
        spider.setUUID(UUID.randomUUID().toString());
        spider.addUrl(new String[]{"https://www.wmtxt.com/0/1/2.html"});
        spider.addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                String content = resultItems.get("content");
                System.out.println(">>>>" + content);
            }
        });
        spider.thread(1);
        spider.run();
    }
}
